In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.utils import np_utils
from keras.datasets import mnist
# for Multi-layer Perceptron (MLP) model
from keras.models import Sequential
from keras.layers import Dense
# for Convolutional Neural Network (CNN) model
from keras.layers import Dropout, Flatten
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
# fix for issue: https://github.com/fchollet/keras/issues/2681
from keras import backend as K
# Use channels-first image tensors, i.e. (channels, rows, cols).
# `set_image_dim_ordering('th')` is the legacy Keras 1 spelling of this setting and is
# not available in newer Keras releases; `set_image_data_format('channels_first')`
# is the Keras 2 equivalent and selects the same layout.
K.set_image_data_format('channels_first')
In [3]:
# Load the MNIST train/test split: X_* hold the images, y_* the integer digit labels.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
In [4]:
# Preview the first six training digits side by side, each titled with its label.
plt.figure(figsize=[20,8])
preview = zip(X_train[:6], y_train[:6])
for panel, (digit_image, digit_label) in enumerate(preview, start=1):
    plt.subplot(1, 6, panel)
    plt.imshow(digit_image, cmap='gray', interpolation='none')
    plt.title("Class {}".format(digit_label))
In [5]:
# fix random seed for reproducibility
# NOTE(review): this seeds NumPy's global RNG only; whether it also pins the weight
# initialisation of the Keras backend depends on the backend in use — confirm.
seed = 7
np.random.seed(seed)
In [6]:
# Show the dimensions of the training images and of the training labels.
for training_array in (X_train, y_train):
    print(training_array.shape)
In [7]:
# Print the raw pixel values of the first training image.
# At this point the image is still a 2-D matrix (28 rows x 28 columns).
print (X_train[0])
In [8]:
# Turn every 2-D image into a single row vector (28 * 28 = 784 values) so it
# can be fed to a fully connected input layer; cast to float32 for training.
num_pixels = X_train.shape[1] * X_train.shape[2]
flat_train_shape = (X_train.shape[0], num_pixels)
flat_test_shape = (X_test.shape[0], num_pixels)
X_train = X_train.reshape(flat_train_shape).astype('float32')
X_test = X_test.reshape(flat_test_shape).astype('float32')
print(num_pixels, X_train.shape, X_test.shape)
In [9]:
# Inspect the second training image after flattening (still unscaled at this point).
print (X_train[1])
In [10]:
# Gray-scale intensities lie in 0..255; rescale both splits to the 0..1 range.
X_train, X_test = X_train / 255, X_test / 255
print(X_train[1])
The output/target variable is in the format 0 to 9. As this is a multi-class classification problem, we convert the output class values into one-hot format, which is simply a binary matrix with one column per class (10 columns here), i.e.
value 0 will be converted to one-hot format as [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
value 1 will be converted to one-hot format as [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
value 2 will be converted to one-hot format as [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
and so on...
In [11]:
# Labels are still plain integers here: show how many there are and the first one.
print (y_train.shape)
print (y_train[0])
In [12]:
# One-hot encode the integer labels.
# The encoded arrays use a capital "Y" (Y_train / Y_test); the original integer
# labels stay available under the lower-case names (y_train / y_test).
Y_train, Y_test = (np_utils.to_categorical(labels) for labels in (y_train, y_test))
# Number of classes = width of the one-hot matrix.
num_classes = Y_test.shape[1]
In [13]:
# Compare the integer labels with their one-hot encoded counterparts (shape and first entry).
print (y_train.shape, Y_train.shape)
print (y_train[0], Y_train[0])
Generally, neural networks have the following properties:
A single-layer perceptron model is the simplest kind of neural network where there are only two layers: input layer and output layer. The inputs are directly fed into the outputs via a series of weights. It's a feed-forward network where the information moves in only one direction, i.e. forward direction from input nodes to output nodes.
A multi-layer perceptron model is the other kind of neural network, where there are one or more hidden layers in between the input and output layers. The information flows from the input layer to the hidden layers and then to the output layer, so these models are also feed-forward; they are typically trained with the back-propagation method. In back-propagation, the error is calculated in the output layer by computing the difference between the actual output and the predicted output. The error is then distributed back through the network layers. Based on this error, the algorithm adjusts the weights of each connection in order to reduce the error value. When the network has many hidden layers, this type of learning is also referred to as deep learning.
We create a simple neural network model with one hidden layer of 784 neurons. Our input layer will also have 784 neurons, as we have flattened each image of our training dataset into a single 784-dimensional vector.
softmax activation is used in the output layer.
adam gradient descent optimizer is used to learn weights.
In [14]:
def baseline_model():
    """Build and compile the baseline multi-layer perceptron.

    The network has one hidden ReLU layer with ``num_pixels`` units fed by
    ``num_pixels`` inputs, followed by a softmax layer with ``num_classes``
    units. Both sizes are read from the notebook-level variables of the same
    name.

    Returns:
        A ``Sequential`` model compiled with categorical cross-entropy loss,
        the Adam optimizer and accuracy as the reported metric.
    """
    hidden_layer = Dense(
        num_pixels,
        input_dim=num_pixels,
        kernel_initializer='normal',
        activation='relu',
    )
    output_layer = Dense(
        num_classes,
        kernel_initializer='normal',
        activation='softmax',
    )
    mlp = Sequential([hidden_layer, output_layer])
    mlp.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return mlp
The model is fit over 5 epochs (full passes over the training data). It processes a batch of 200 images per weight update. Test data is used as the validation set. The number of epochs may be increased to improve accuracy.
Finally, test data is used to evaluate the model by calculating the model's classification accuracy.
In [15]:
# Build the MLP and train it for 5 epochs with mini-batches of 200 images.
model = baseline_model()
# The test split doubles as the validation set here, so the per-epoch validation
# metrics are computed on the same data that model.evaluate uses further down.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=5, batch_size=200, verbose=1)
Out[15]:
In [16]:
# Print the layer-by-layer summary of the MLP built above.
model.summary()
In [17]:
# Evaluate on the test split; `scores` is [loss, accuracy] given the single
# 'accuracy' metric the model was compiled with.
scores = model.evaluate(X_test, Y_test, verbose=0)
print(scores)
test_loss, test_accuracy = scores[0], scores[1]
print('Score: {}'.format(test_loss))
print('Accuracy: {}'.format(test_accuracy))
In [18]:
# get predicted values
# `Sequential.predict_classes` is not available in newer Keras releases. For a
# softmax output it is equivalent to taking the arg-max over the last axis of the
# predicted class probabilities, which works on old and new versions alike.
predicted_classes = np.argmax(model.predict(X_test), axis=-1)
In [19]:
# Positions in the test set where the predicted digit matches the true label ...
correct_indices = np.nonzero(predicted_classes == y_test)[0]
# ... and positions where it does not.
incorrect_indices = np.nonzero(predicted_classes != y_test)[0]
In [20]:
# Tally how many test digits were classified correctly vs. incorrectly.
num_correct = np.size(correct_indices)
num_incorrect = np.size(incorrect_indices)
print('Correctly predicted: %i' % num_correct)
print('Incorrectly predicted: %i' % num_incorrect)
In [21]:
# Draw two figures: the first six correctly classified test digits, then the
# first six misclassified ones. Each panel is titled with prediction and label.
for sample_indices in (correct_indices, incorrect_indices):
    plt.figure(figsize=[20,8])
    for panel, sample in enumerate(sample_indices[:6], start=1):
        plt.subplot(1, 6, panel)
        # Test images are flattened at this point; restore the 28x28 grid for display.
        plt.imshow(X_test[sample].reshape(28,28), cmap='gray', interpolation='none')
        plt.title("Predicted {}, Class {}".format(predicted_classes[sample], y_test[sample]))
In [22]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

sns.set() # setting seaborn default for plots

class_names = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Compute confusion matrix (rows: true digit, columns: predicted digit).
# Passing `labels` pins the matrix to one row/column per entry of class_names,
# so it always matches the DataFrame index/columns built below.
cnf_matrix = confusion_matrix(y_test, predicted_classes, labels=class_names)
np.set_printoptions(precision=2)

print ('Confusion Matrix in Numbers')
print (cnf_matrix)
print ('')

# Row-normalise: each row is divided by the number of true samples of that class,
# giving fractions in the range 0..1. Rows without any samples are left at 0
# instead of triggering a division by zero.
row_totals = cnf_matrix.sum(axis=1)[:, np.newaxis]
cnf_matrix_percent = np.divide(cnf_matrix.astype('float'), row_totals,
                               out=np.zeros(cnf_matrix.shape, dtype='float'),
                               where=row_totals != 0)

print ('Confusion Matrix in Percentage')
print (cnf_matrix_percent)
print ('')

true_class_names = class_names
predicted_class_names = class_names

df_cnf_matrix = pd.DataFrame(cnf_matrix,
                             index = true_class_names,
                             columns = predicted_class_names)

df_cnf_matrix_percent = pd.DataFrame(cnf_matrix_percent,
                                     index = true_class_names,
                                     columns = predicted_class_names)

# Heatmap of the absolute counts. The trailing semicolon keeps the cell from
# echoing the return value of the last call as its output.
plt.figure(figsize = (8,6))
ax = sns.heatmap(df_cnf_matrix, annot=True, fmt='d')
ax.set_ylabel('True values')
ax.set_xlabel('Predicted values')
ax.set_title('Confusion Matrix in Numbers');
Out[22]:
The above confusion matrix heatmap shows that:
Convolutional Neural Networks (CNN) are similar to Multi-layer Perceptron Neural Networks. They are also made up of neurons that have learnable weights and biases. CNNs have been successfully applied to analyzing visual imagery. They are mostly being applied in image and video recognition, recommender systems and natural language processing.
A CNN consists of multiple hidden layers. The hidden layers are either convolutional, pooling or fully connected.
Convolution layer: Feature extraction is done in this layer. This layer applies the convolution operation to the input and passes the result to the next layer. In the image classification problem, a weight matrix is defined in the convolution layer. A dot product is computed between the weight matrix and a small part of the input image (of the same size as the weight matrix). The weight matrix slides across the image such that all the pixels are covered at least once, to give a convolved output.
The weight matrix behaves like a filter in an image extracting particular information from the original image matrix.
One weight combination might extract edges, while another might extract a particular color, and yet another might just blur out unwanted noise.
The weights are learnt such that the loss function is minimized similar to a Multi-layer Perceptron.
Therefore weights are learnt to extract features from the original image which help the network in correct prediction.
When we have multiple convolutional layers, the initial layers extract more generic features, while, as the network gets deeper, the features extracted by the weight matrices become more and more complex and more suited to the problem at hand.
Reference: Architecture of Convolutional Neural Networks (CNNs) demystified
Stride: While computing the dot product, if the weight matrix moves 1 pixel at a time then we call it a stride of 1. Size of the image keeps on reducing as we increase the stride value.
Padding: Adding one or more layers of zeros around the border of the image helps to counter the reduction in output size caused by the convolution and by larger strides. With suitable padding and a stride of 1, the initial size of the image is retained.
Pooling layer: Reduction in the number of feature parameters is done in this layer. When the image size is too large, we need a pooling layer in between two convolution layers. This layer helps to reduce the number of trainable parameters needed for the input image. The main purpose of pooling is to reduce the spatial size of the image. This layer is also used to control overfitting.
Fully connected layer: This layer comes after convolution and pooling layers. This layer connects each neuron in one layer to every neuron in another layer. This is similar to the concept of layer connection of Multi-layer perceptron model. Error is computed in the output layer by computing the difference in actual output and predicted output. After that, back-propagation is used to update the weight and biases for error and loss reduction.
In [24]:
# load data
# Reload the raw MNIST arrays: the X_train / X_test variables were flattened and
# rescaled for the MLP earlier in this notebook.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
The image dimension expected by Keras for 2D (two-dimensional) convolution, with the channels-first ordering configured at the top of this notebook, is in the format [channels][rows][columns] (the channel dimension is called "pixels" in this notebook).
For an RGB color image, the first dimension (channels) would be 3, for the red, green and blue components. It's like having 3 image inputs for every single color image. In our case (for MNIST handwritten images), we have gray scale images. Hence, the channel dimension is set to 1.
In [25]:
# reshape to be [samples][channels][rows][cols] (channels-first, 1 channel for gray scale)
# The image size is read from the data instead of being hard-coded, and the pixel
# count is computed here so that this cell does not rely on the `num_pixels`
# variable left over from the MLP section.
img_rows, img_cols = X_train.shape[-2:]
X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols).astype('float32')
X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols).astype('float32')
print (img_rows * img_cols, X_train.shape, X_test.shape)
In [26]:
# Inspect the second training image after the reshape (values are not rescaled yet).
print (X_train[1])
In [27]:
# Gray-scale intensities lie in 0..255; rescale both splits to the 0..1 range.
X_train, X_test = X_train / 255, X_test / 255
print(X_train[1])
The output/target variable is in the format 0 to 9. As this is a multi-class classification problem, we convert the output class values into one-hot format, which is simply a binary matrix with one column per class (10 columns here), i.e.
value 0 will be converted to one-hot format as [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
value 1 will be converted to one-hot format as [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
value 2 will be converted to one-hot format as [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
and so on...
In [28]:
# Labels are plain integers again after the reload: show their count and the first one.
print (y_train.shape)
print (y_train[0])
In [29]:
# One-hot encode the integer labels.
# The encoded arrays use a capital "Y" (Y_train / Y_test); the original integer
# labels stay available under the lower-case names (y_train / y_test).
Y_train, Y_test = (np_utils.to_categorical(labels) for labels in (y_train, y_test))
# Number of classes = width of the one-hot matrix.
num_classes = Y_test.shape[1]
In [30]:
# Compare the integer labels with their one-hot encoded counterparts (shape and first entry).
print (y_train.shape, Y_train.shape)
print (y_train[0], Y_train[0])
Convolution Layer
Max Pooling Layer
Dropout Layer
Flatten
Fully connected Layer
In [33]:
# baseline model for CNN
def baseline_model():
    """Build and compile the baseline convolutional network.

    NOTE: this definition reuses the name ``baseline_model`` of the MLP builder
    defined earlier in the notebook and replaces it once this cell has run.

    Layers, in order: a 32-filter 5x5 convolution (ReLU), 2x2 max pooling,
    dropout of 0.2, flatten, a 128-unit dense ReLU layer and a softmax output
    with ``num_classes`` units (read from the notebook-level variable).

    Returns:
        A ``Sequential`` model compiled with categorical cross-entropy loss,
        the Adam optimizer and accuracy as the reported metric.
    """
    # input_shape=(1, 28, 28) is a channels-first layout. State that explicitly on
    # the layers that interpret it, so the model does not silently depend on the
    # global backend image format configured in another cell.
    data_format = 'channels_first'

    # create model
    model = Sequential()
    model.add(Conv2D(32, (5, 5), input_shape=(1, 28, 28), activation='relu',
                     data_format=data_format))
    model.add(MaxPooling2D(pool_size=(2, 2), data_format=data_format))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))

    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
The model is fit over 5 epochs (full passes over the training data). It processes a batch of 200 images per weight update. Test data is used as the validation set. The number of epochs may be increased to improve accuracy.
Finally, test data is used to evaluate the model by calculating the model's classification accuracy.
In [35]:
# Build the CNN and train it for 5 epochs with mini-batches of 200 images.
model = baseline_model()
# The test split doubles as the validation set here, so the per-epoch validation
# metrics are computed on the same data that model.evaluate uses further down.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=5, batch_size=200, verbose=1)
Out[35]:
In [38]:
# Print the layer-by-layer summary of the CNN built above.
model.summary()
In [36]:
# Evaluate on the test split; `scores` is [loss, accuracy] given the single
# 'accuracy' metric the model was compiled with.
scores = model.evaluate(X_test, Y_test, verbose=0)
print(scores)
test_loss, test_accuracy = scores[0], scores[1]
print('Score: {}'.format(test_loss))
print('Accuracy: {}'.format(test_accuracy))
Accuracy (98.75%) of Convolution Neural Network (CNN) model has improved as compared to the accuracy (97.91%) of Multi-layer Perceptron (MLP) model.
The accuracy of CNN model can be further increased by:
In [37]:
# get predicted values
# `Sequential.predict_classes` is not available in newer Keras releases. For a
# softmax output it is equivalent to taking the arg-max over the last axis of the
# predicted class probabilities, which works on old and new versions alike.
predicted_classes = np.argmax(model.predict(X_test), axis=-1)
In [39]:
# Positions in the test set where the predicted digit matches the true label ...
correct_indices = np.nonzero(predicted_classes == y_test)[0]
# ... and positions where it does not.
incorrect_indices = np.nonzero(predicted_classes != y_test)[0]
In [40]:
# Tally how many test digits were classified correctly vs. incorrectly.
num_correct = np.size(correct_indices)
num_incorrect = np.size(incorrect_indices)
print('Correctly predicted: %i' % num_correct)
print('Incorrectly predicted: %i' % num_incorrect)
In [41]:
# Draw two figures: the first six correctly classified test digits, then the
# first six misclassified ones. Each panel is titled with prediction and label.
for sample_indices in (correct_indices, incorrect_indices):
    plt.figure(figsize=[20,8])
    for panel, sample in enumerate(sample_indices[:6], start=1):
        plt.subplot(1, 6, panel)
        # Test images carry a leading channel axis here; drop it to get a 28x28 grid.
        plt.imshow(X_test[sample].reshape(28,28), cmap='gray', interpolation='none')
        plt.title("Predicted {}, Class {}".format(predicted_classes[sample], y_test[sample]))
In [42]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

sns.set() # setting seaborn default for plots

class_names = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Compute confusion matrix (rows: true digit, columns: predicted digit).
# Passing `labels` pins the matrix to one row/column per entry of class_names,
# so it always matches the DataFrame index/columns built below.
cnf_matrix = confusion_matrix(y_test, predicted_classes, labels=class_names)
np.set_printoptions(precision=2)

print ('Confusion Matrix in Numbers')
print (cnf_matrix)
print ('')

# Row-normalise: each row is divided by the number of true samples of that class,
# giving fractions in the range 0..1. Rows without any samples are left at 0
# instead of triggering a division by zero.
row_totals = cnf_matrix.sum(axis=1)[:, np.newaxis]
cnf_matrix_percent = np.divide(cnf_matrix.astype('float'), row_totals,
                               out=np.zeros(cnf_matrix.shape, dtype='float'),
                               where=row_totals != 0)

print ('Confusion Matrix in Percentage')
print (cnf_matrix_percent)
print ('')

true_class_names = class_names
predicted_class_names = class_names

df_cnf_matrix = pd.DataFrame(cnf_matrix,
                             index = true_class_names,
                             columns = predicted_class_names)

df_cnf_matrix_percent = pd.DataFrame(cnf_matrix_percent,
                                     index = true_class_names,
                                     columns = predicted_class_names)

# Heatmap of the absolute counts. The trailing semicolon keeps the cell from
# echoing the return value of the last call as its output.
plt.figure(figsize = (8,6))
ax = sns.heatmap(df_cnf_matrix, annot=True, fmt='d')
ax.set_ylabel('True values')
ax.set_xlabel('Predicted values')
ax.set_title('Confusion Matrix in Numbers');
Out[42]:
Using Multi-layer Perceptron (MLP) Model, we had the following heatmap outcome:
Using Convolutional Neural Network (CNN) Model, we had the following improvements:
The accuracy of CNN model can be further increased by:
In [ ]:
# Assemble the submission table: 1-based image ids paired with the predicted digit.
image_ids = list(range(1, len(predicted_classes) + 1))
submissions = pd.DataFrame({'ImageId': image_ids, "Label": predicted_classes})
#submissions.to_csv("submission.csv", index=False, header=True)